{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNXBc0ZTxbtr63I2LiiPsTA",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/langchain_nomic_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1O1vgh-veZzX"
      },
      "outputs": [],
      "source": [
        "# Install the notebook's dependencies into the running kernel's environment\n",
        "# (%pip targets the kernel's interpreter; !pip may hit a different one).\n",
        "#\n",
        "# langchain-community is listed explicitly because later cells import from it\n",
        "# directly and newer langchain releases may not install it as a dependency.\n",
        "# langchain-text-splitters provides the text splitter implementation, and\n",
        "# beautifulsoup4 is needed by WebBaseLoader to parse the fetched HTML.\n",
        "%pip install -q -U langchain langchain-community langchain-text-splitters chromadb tiktoken beautifulsoup4 langchain-nomic langchain-openai"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Blog post that is loaded, split and indexed by the cells below.\n",
        "POST_URL = \"https://blog.nomic.ai/posts/nomic-embed-text-v1\""
      ],
      "metadata": {
        "id": "LCA0Sqote-__"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_community.document_loaders import WebBaseLoader\n",
        "\n",
        "# Load the page at POST_URL into LangChain documents.\n",
        "loader = WebBaseLoader(POST_URL)\n",
        "docs = loader.load()"
      ],
      "metadata": {
        "id": "AUCuhl1NfA2V"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(docs)"
      ],
      "metadata": {
        "id": "aXUyW9stfWav"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Import from the dedicated splitter package; `langchain.text_splitter` is a\n",
        "# deprecated re-export of the same class.\n",
        "from langchain_text_splitters import CharacterTextSplitter\n",
        "\n",
        "# from_tiktoken_encoder measures chunk_size / chunk_overlap in tiktoken tokens,\n",
        "# not characters.\n",
        "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
        "    chunk_size=7500, chunk_overlap=100\n",
        ")\n",
        "doc_splits = text_splitter.split_documents(docs)"
      ],
      "metadata": {
        "id": "67ozgMXqfXIt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from google.colab import userdata\n",
        "\n",
        "# Copy the Nomic API key from Colab's user secrets into the process environment.\n",
        "os.environ['NOMIC_API_KEY'] = userdata.get('NOMIC_API_KEY')\n",
        "\n",
        "from langchain_community.vectorstores import Chroma\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
        "# NomicEmbeddings is imported once, from the package's public entry point\n",
        "# (the former second import from langchain_nomic.embeddings bound the same name).\n",
        "from langchain_nomic import NomicEmbeddings"
      ],
      "metadata": {
        "id": "bgjXz_GwgV7A"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Embed each chunk with nomic-embed-text-v1, index the vectors in Chroma,\n",
        "# and expose the index as a retriever for the chain.\n",
        "embeddings = NomicEmbeddings(model=\"nomic-embed-text-v1\")\n",
        "vectorstore = Chroma.from_documents(documents=doc_splits, embedding=embeddings)\n",
        "retriever = vectorstore.as_retriever()"
      ],
      "metadata": {
        "id": "hBvJESZEgAH1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Copy the OpenAI API key from Colab's user secrets into the process environment.\n",
        "# Assigned directly so the secret is never bound to a notebook variable.\n",
        "os.environ['OPENAI_API_KEY'] = userdata.get(\"OPENAI_API_KEY\")"
      ],
      "metadata": {
        "id": "cjSzc74xhKHU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain_community.chat_models import ChatOllama\n",
        "from langchain_core.prompts import ChatPromptTemplate\n",
        "from langchain_openai import ChatOpenAI\n",
        "\n",
        "\n",
        "def format_docs(retrieved_docs):\n",
        "    \"\"\"Join the text of the retrieved documents into one context string.\n",
        "\n",
        "    Without this step the prompt's {context} slot is filled with the repr of\n",
        "    a list of Document objects (metadata and escaped newlines included)\n",
        "    rather than the readable page text.\n",
        "\n",
        "    Args:\n",
        "        retrieved_docs: iterable of objects exposing a `page_content` string.\n",
        "\n",
        "    Returns:\n",
        "        The page contents separated by blank lines; \"\" when nothing was retrieved.\n",
        "    \"\"\"\n",
        "    return \"\\n\\n\".join(doc.page_content for doc in retrieved_docs)\n",
        "\n",
        "\n",
        "# Prompt\n",
        "template = \"\"\"Answer the question based only on the following context:\n",
        "{context}\n",
        "\n",
        "Question: {question}\n",
        "\"\"\"\n",
        "prompt = ChatPromptTemplate.from_template(template)\n",
        "\n",
        "# The input question is sent both to the retriever (whose results are\n",
        "# flattened by format_docs) and, unchanged, to the prompt's {question} slot.\n",
        "chain = (\n",
        "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
        "    | prompt\n",
        "    | ChatOpenAI(temperature=0, model=\"gpt-4-1106-preview\")\n",
        "    | StrOutputParser()\n",
        ")\n"
      ],
      "metadata": {
        "id": "dMjjps1WgyGH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Run the full retrieve -> prompt -> LLM -> parse pipeline for one question.\n",
        "question = \"How did Nomic Embed get trained?\"\n",
        "response = chain.invoke(question)"
      ],
      "metadata": {
        "id": "ADYQ7q-yhhDO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "response"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 183
        },
        "id": "mdasttaxhs3s",
        "outputId": "cbe76cbe-0b55-4d94-e7d1-1f63af7b71cd"
      },
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"Nomic Embed was trained using a multi-stage contrastive learning pipeline. The process began with a BERT initialization, specifically training their own BERT model with a 2048 token context length, named nomic-bert-2048. This model incorporated several modifications inspired by MosaicBERT, such as using Rotary Position Embeddings for context length extrapolation, employing SwiGLU activations for improved performance, setting dropout to 0, and implementing various training optimizations like using Deepspeed and FlashAttention, training in BF16 precision, increasing the vocab size to a multiple of 64, training with a large batch size of 4096, and masking at a 30% rate during masked language modeling.\\n\\nAfter establishing the nomic-bert-2048, the next phase involved contrastive training with a dataset composed of approximately 235 million text pairs. This dataset was extensively validated for quality during collection with Nomic Atlas. The details of the dataset can be found in the nomic-ai/contrastors codebase, and a subset of 5 million pairs is available for exploration in Nomic Atlas.\\n\\nThe contrastive training aimed to fine-tune the model on high-quality labeled datasets, leveraging search queries and answers from web searches, and employing data curation and hard-example mining techniques. The model's performance was evaluated on benchmarks like the GLUE benchmark, the Massive Text Embedding Benchmark (MTEB), the LoCo Benchmark, and the Jina Long Context Benchmark, where it demonstrated superior performance compared to other models like OpenAI's text-embedding-ada-002 and jina-embeddings-v2-base-en on various tasks.\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "7xqm1DsjhtNv"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}